
* +++++++++++++++++++++
* CLEAN DATA ON
* MIGRATION ACROSS 
* COUNTIES
* +++++++++++++++++++++

/*
Kreiswanderungsmatrix
Source: acquired from Statistisches Bundesamt
*/

* globals
global data_raw_kws "${data_raw}/migration_matrix/"

* loop over all files by year
forval year = 2012/2017 {
	local files : dir "${data_raw_kws}/KWS mit Alter `year'/" files "*.xlsx"
	foreach file in `files' {
		di "`file'"
		import excel using "${data_raw_kws}/KWS mit Alter `year'//`file'" ,clear firstrow
		
		* focus on main variables and give them names
		keep Herkunftsland HLD HerkunftskreisSchlüssel HerkunftskreisKreistext ///
			Zielland ZLD ZielkreisSchlüssel ZielkreisKreistext Altersgruppen Zugezogene P-AF

		ren (Herkunftsland HLD HerkunftskreisSchlüssel HerkunftskreisKreistext ///
			Zielland ZLD ZielkreisSchlüssel ZielkreisKreistext Altersgruppen Zugezogene P-AF) ///
			(origin_state_name origin_state origin_ags origin_place_name ///
			dest_state_name dest_state dest_ags dest_place_name age_group ///
			to_origin to_origin_m to_origin_f to_origin_de to_origin_de_m to_origin_de_f ///
			to_origin_for to_origin_for_m to_origin_for_f ///
			to_dest to_dest_m to_dest_f to_dest_de to_dest_de_m to_dest_de_f ///
			to_dest_for to_dest_for_m to_dest_for_f)
		
		* destring numbers of moves
		destring origin_ags origin_state dest_ags dest_state to_origin* to_dest*, replace force	
		
		* focus on rows containing data
		keep if origin_state_name != ""
		
		* clean up age group variable
		replace age_group = "18_24" if age_group == "18 - 25"
		replace age_group = "25_29" if age_group == "25 - 30"
		replace age_group = "30_49" if age_group == "30 - 50"
		replace age_group = "50_64" if age_group == "50 - 65"	
		replace age_group = "65plus" if age_group == "65 und älter"
		replace age_group = "17oryounger" if age_group == "unter 18"	
	
		* in some very few instances rows appear to be in the data twice
		* in that case just take the sum of those rows
		collapse (rawsum) to_origin* to_dest* ///
			(first) origin_state* origin_place_name dest_state* dest_place_name, ///
			by(origin_ags dest_ags age_group)

		local state = origin_state[1] 
		tempfile file_`state'
		save `file_`state''
	}
	
	* combine files for each state (within year)
	use `file_1', clear
	forval state = 2/16{
		append using `file_`state''
	}

	save "${data_derived}/all_rows_combined.dta", replace
	
	* foreach combination of origin-destination the rows are essentially included twice
	* e.g. in the file for Berlin we see all the inflows and outflows 
	* but then in the other files we also see in and outflows with Berlin
	* sometimes these don't perfectly line up 
	* e.g. sometimes Stuttgart to Berlin 516 and in other data 518
	* we take the mean of these
	use "${data_derived}/all_rows_combined.dta", clear	
	drop if origin_state_name =="© Statistisches Bundesamt (Destatis), 2018"
	keep origin_ags dest_ags age_group to_origin* to_dest*

	ds to_origin* to_dest*
	local vars `r(varlist)'

	renvars to_origin* to_dest*, suff(_og)

	tempfile ready_to_merge 	
	save `ready_to_merge'

	ren *_og *
	ren (origin_ags dest_ags to_origin* to_dest*) ///
		(dest_ags origin_ags to_dest*_flipped to_origin*_flipped)
		
	merge 1:1 origin_ags dest_ags age_group using `ready_to_merge'
	
	* now take the mean over the values
	foreach var in `vars' {
		egen `var' = rowmean(`var'_og `var'_flipped)
	}
	
	* clean up and save
	keep origin_ags dest_ags age_group to_dest*
	drop *_og *_flipped 

	ren to_dest* n_moves*
	gen year = `year'
	order origin_ags dest_ags year, first

	save "${data_derived}/moves_`year'.dta", replace
	erase "${data_derived}/all_rows_combined.dta"
}

* combine files for all years
use "${data_derived}/moves_2012.dta", clear
forval year = 2013/2017 {
	append using "${data_derived}/moves_`year'.dta"
}

isid origin_ags dest_ags year age_group
sort origin_ags dest_ags year age_group

save "${data_derived}/all_moves.dta", replace

* erase individual files
forval year = 2012/2017 {
	erase "${data_derived}/moves_`year'.dta"
}

* focus on 2016-2017 and clean it up further: i.e. make balanced sample!
use "${data_derived}/all_moves.dta", clear

keep if inrange(year, 2016, 2017)
drop if origin_ags == dest_ags // these are typically not included but sometimes they are and then there just all zeros, so let's drop them

preserve
	bys origin_ags age_group year : keep if _n==1
	keep origin_ags age_group year
	tempfile all_ags_age
	save `all_ags_age'

	bys origin_ags : keep if _n==1 
	keep origin_ags
	ren origin_ags dest_ags

	cross using `all_ags_age'
	drop if origin_ags == dest_ags 
	tempfile all_combos
	save `all_combos'
restore

merge 1:1 origin_ags dest_ags age_group year using `all_combos', assert(2 3) keep(2 3)

ds n_moves*
foreach var of varlist `r(varlist)' {
	qui: replace `var' = 0 if _merge ==2
	assert `var' !=.
}
drop _merge

save "${data_derived}/moves_2016_2017.dta", replace
